Executive Summary

Import data and create numeric factors for non-numeric new categorical variables

attritdf<-read.csv(file="/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/CaseStudy2-data.csv",header=TRUE)
noattritdf <- read.csv(file="/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/CaseStudy2CompSetNoAttrition.csv",header=TRUE)
nosalarydf <- read.csv(file="/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/CaseStudy2CompSetNoSalary.csv",header=TRUE)

attritdf$AgeGrp<-cut(attritdf$Age,breaks = c(17,24,34,44,54,Inf),labels=c('18-24','25-34','35-44','46-54','55+'))
noattritdf$AgeGrp<-cut(noattritdf$Age,breaks = c(17,24,34,44,54,Inf),labels=c('18-24','25-34','35-44','46-54','55+'))

attritdf <- attritdf %>% mutate(AttrNum = case_when(
  Attrition == "Yes" ~ 1,
  Attrition == "No" ~ 0
  ))

attritdf <- attritdf %>% mutate(GenNum = case_when(
  Gender == "Male" ~ 1,
  Gender == "Female" ~ 0
))
noattritdf <- noattritdf %>% mutate(GenNum = case_when(
  Gender == "Male" ~ 1,
  Gender == "Female" ~ 0
))

attritdf <- attritdf %>% mutate(TravLevel = case_when(
  BusinessTravel == "Non-Travel" ~ 0,
  BusinessTravel == "Travel_Rarely" ~ 1,
  BusinessTravel == "Travel_Frequently" ~ 2
))
noattritdf <- noattritdf %>% mutate(TravLevel = case_when(
  BusinessTravel == "Non-Travel" ~ 0,
  BusinessTravel == "Travel_Rarely" ~ 1,
  BusinessTravel == "Travel_Frequently" ~ 2
))


attritdf <- attritdf %>% mutate(OTNum = case_when(
  OverTime == "Yes" ~ 1,
  OverTime == "No" ~0
))
noattritdf <- noattritdf %>% mutate(OTNum = case_when(
  OverTime == "Yes" ~ 1,
  OverTime == "No" ~0
))

attritdf <- attritdf %>% mutate(MariStatNum = case_when(
  MaritalStatus == "Single" ~ 0,
  MaritalStatus == "Married" ~ 1,
  MaritalStatus == "Divorced" ~ 2
))
noattritdf <- noattritdf %>% mutate(MariStatNum = case_when(
  MaritalStatus == "Single" ~ 0,
  MaritalStatus == "Married" ~ 1,
  MaritalStatus == "Divorced" ~ 2
))

attritdf <- attritdf %>% mutate(EducNum = case_when(
  EducationField == "Medical" ~ 0,
  EducationField == "Life Sciences" ~ 1,
  EducationField == "Marketing" ~ 2,
  EducationField == "Technical Degree" ~ 3,
  EducationField == "Human Resources" ~ 4,
  EducationField == "Other" ~ 5
))
noattritdf <- noattritdf %>% mutate(EducNum = case_when(
  EducationField == "Medical" ~ 0,
  EducationField == "Life Sciences" ~ 1,
  EducationField == "Marketing" ~ 2,
  EducationField == "Technical Degree" ~ 3,
  EducationField == "Human Resources" ~ 4,
  EducationField == "Other" ~ 5
))

attritdf <- attritdf %>% mutate(JRoleNum = case_when(
  JobRole == "Research Director" ~ 0,
  JobRole == "Manufacturing Director" ~ 1,
  JobRole == "Sales Executive" ~ 2,
  JobRole == "Research Scientist" ~ 3,
  JobRole == "Sales Representative" ~ 4,
  JobRole == "Healthcare Representative" ~ 5,
  JobRole == "Manager" ~ 6,
  JobRole == "Laboratory Technician" ~ 7,
  JobRole == "Human Resources" ~ 8
))
noattritdf <- noattritdf %>% mutate(JRoleNum = case_when(
  JobRole == "Research Director" ~ 0,
  JobRole == "Manufacturing Director" ~ 1,
  JobRole == "Sales Executive" ~ 2,
  JobRole == "Research Scientist" ~ 3,
  JobRole == "Sales Representative" ~ 4,
  JobRole == "Healthcare Representative" ~ 5,
  JobRole == "Manager" ~ 6,
  JobRole == "Laboratory Technician" ~ 7,
  JobRole == "Human Resources" ~ 8
))       

allnumeric<-attritdf%>%dplyr::select(Age,AgeGrp,AttrNum,TravLevel,DailyRate,DistanceFromHome,Education,EducNum,EnvironmentSatisfaction,GenNum,HourlyRate,JobInvolvement,JobLevel,JRoleNum,JobSatisfaction,MariStatNum,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OTNum,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager)

Convert integers to factors where values are categorical

attritdf$JobInvolvement <- as.factor(attritdf$JobInvolvement)
noattritdf$JobInvolvement <- as.factor(noattritdf$JobInvolvement)
nosalarydf$JobInvolvement <- as.factor(nosalarydf$JobInvolvement)

attritdf$JobLevel <- as.factor(attritdf$JobLevel)
noattritdf$JobLevel <- as.factor(noattritdf$JobLevel)
nosalarydf$JobLevel <- as.factor(nosalarydf$JobLevel)

attritdf$JobSatisfaction <- as.factor(attritdf$JobSatisfaction)
noattritdf$JobSatisfaction <- as.factor(noattritdf$JobSatisfaction)
nosalarydf$JobSatisfaction <- as.factor(nosalarydf$JobSatisfaction)

attritdf$PerformanceRating <- as.factor(attritdf$PerformanceRating)
noattritdf$PerformanceRating <- as.factor(noattritdf$PerformanceRating)
nosalarydf$PerformanceRating <- as.factor(nosalarydf$PerformanceRating)

attritdf$RelationshipSatisfaction <- as.factor(attritdf$RelationshipSatisfaction)
noattritdf$RelationshipSatisfaction <- as.factor(noattritdf$RelationshipSatisfaction)
nosalarydf$RelationshipSatisfaction <- as.factor(nosalarydf$RelationshipSatisfaction)

attritdf$StockOptionLevel <- as.factor(attritdf$StockOptionLevel)
nosalarydf$StockOptionLevel <- as.factor(nosalarydf$StockOptionLevel)
noattritdf$StockOptionLevel <- as.factor(noattritdf$StockOptionLevel)

attritdf$TrainingTimesLastYear <- as.factor(attritdf$TrainingTimesLastYear)
noattritdf$TrainingTimesLastYear <- as.factor(noattritdf$TrainingTimesLastYear)
nosalarydf$TrainingTimesLastYear <- as.factor(nosalarydf$TrainingTimesLastYear)

attritdf$WorkLifeBalance <- as.factor(attritdf$WorkLifeBalance)
noattritdf$WorkLifeBalance <- as.factor(noattritdf$WorkLifeBalance)
nosalarydf$WorkLifeBalance <- as.factor(nosalarydf$WorkLifeBalance)

attritdf$Education <- as.factor(attritdf$Education)
noattritdf$Education <- as.factor(noattritdf$Education)
nosalarydf$Education <- as.factor(nosalarydf$Education)

attritdf$EnvironmentSatisfaction <- as.factor(attritdf$EnvironmentSatisfaction)
noattritdf$EnvironmentSatisfaction <- as.factor(noattritdf$EnvironmentSatisfaction)
nosalarydf$EnvironmentSatisfaction <- as.factor(nosalarydf$EnvironmentSatisfaction)

EDA Section

Analysis with DataExplorer library

str(attritdf)
## 'data.frame':    870 obs. of  44 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : Factor w/ 5 levels "1","2","3","4",..: 2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : Factor w/ 2 levels "3","4": 1 1 1 1 1 2 1 1 1 1 ...
##  $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : Factor w/ 4 levels "0","1","2","3": 2 1 1 3 1 3 1 4 2 2 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : Factor w/ 7 levels "0","1","2","3",..: 4 3 3 4 3 5 6 6 3 4 ...
##  $ WorkLifeBalance         : Factor w/ 4 levels "1","2","3","4": 2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
##  $ AgeGrp                  : Factor w/ 5 levels "18-24","25-34",..: 2 3 3 2 1 2 3 3 2 2 ...
##  $ AttrNum                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ GenNum                  : num  1 1 1 0 0 1 1 0 0 1 ...
##  $ TravLevel               : num  1 1 2 1 2 2 1 1 1 2 ...
##  $ OTNum                   : num  0 0 0 0 1 0 1 1 1 0 ...
##  $ MariStatNum             : num  2 0 0 1 0 2 1 2 1 1 ...
##  $ EducNum                 : num  1 0 1 2 3 1 0 1 1 3 ...
##  $ JRoleNum                : num  2 0 1 2 3 1 3 2 4 5 ...
names(attritdf)
##  [1] "ID"                       "Age"                     
##  [3] "Attrition"                "BusinessTravel"          
##  [5] "DailyRate"                "Department"              
##  [7] "DistanceFromHome"         "Education"               
##  [9] "EducationField"           "EmployeeCount"           
## [11] "EmployeeNumber"           "EnvironmentSatisfaction" 
## [13] "Gender"                   "HourlyRate"              
## [15] "JobInvolvement"           "JobLevel"                
## [17] "JobRole"                  "JobSatisfaction"         
## [19] "MaritalStatus"            "MonthlyIncome"           
## [21] "MonthlyRate"              "NumCompaniesWorked"      
## [23] "Over18"                   "OverTime"                
## [25] "PercentSalaryHike"        "PerformanceRating"       
## [27] "RelationshipSatisfaction" "StandardHours"           
## [29] "StockOptionLevel"         "TotalWorkingYears"       
## [31] "TrainingTimesLastYear"    "WorkLifeBalance"         
## [33] "YearsAtCompany"           "YearsInCurrentRole"      
## [35] "YearsSinceLastPromotion"  "YearsWithCurrManager"    
## [37] "AgeGrp"                   "AttrNum"                 
## [39] "GenNum"                   "TravLevel"               
## [41] "OTNum"                    "MariStatNum"             
## [43] "EducNum"                  "JRoleNum"
describe(attritdf)
## attritdf 
## 
##  44  Variables      870  Observations
## ---------------------------------------------------------------------------
## ID 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0      870        1    435.5    290.3    44.45    87.90 
##      .25      .50      .75      .90      .95 
##   218.25   435.50   652.75   783.10   826.55 
## 
## lowest :   1   2   3   4   5, highest: 866 867 868 869 870
## ---------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       43    0.999    36.83    10.07       24       26 
##      .25      .50      .75      .90      .95 
##       30       35       43       50       54 
## 
## lowest : 18 19 20 21 22, highest: 56 57 58 59 60
## ---------------------------------------------------------------------------
## Attrition 
##        n  missing distinct 
##      870        0        2 
##                       
## Value         No   Yes
## Frequency    730   140
## Proportion 0.839 0.161
## ---------------------------------------------------------------------------
## BusinessTravel 
##        n  missing distinct 
##      870        0        3 
##                                                                 
## Value             Non-Travel Travel_Frequently     Travel_Rarely
## Frequency                 94               158               618
## Proportion             0.108             0.182             0.710
## ---------------------------------------------------------------------------
## DailyRate 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0      627        1    815.2    463.3    175.4    257.8 
##      .25      .50      .75      .90      .95 
##    472.5    817.5   1165.8   1368.0   1436.7 
## 
## lowest :  103  111  117  119  120, highest: 1490 1495 1496 1498 1499
## ---------------------------------------------------------------------------
## Department 
##        n  missing distinct 
##      870        0        3 
##                                                         
## Value             Human Resources Research & Development
## Frequency                      35                    562
## Proportion                  0.040                  0.646
##                                  
## Value                       Sales
## Frequency                     273
## Proportion                  0.314
## ---------------------------------------------------------------------------
## DistanceFromHome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       29    0.993    9.339    8.843      1.0      1.0 
##      .25      .50      .75      .90      .95 
##      2.0      7.0     14.0     23.1     26.0 
## 
## lowest :  1  2  3  4  5, highest: 25 26 27 28 29
## ---------------------------------------------------------------------------
## Education 
##        n  missing distinct 
##      870        0        5 
## 
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##                                         
## Value          1     2     3     4     5
## Frequency     98   182   324   240    26
## Proportion 0.113 0.209 0.372 0.276 0.030
## ---------------------------------------------------------------------------
## EducationField 
##        n  missing distinct 
##      870        0        6 
## 
## lowest : Human Resources  Life Sciences    Marketing        Medical          Other           
## highest: Life Sciences    Marketing        Medical          Other            Technical Degree
##                                                              
## Value       Human Resources    Life Sciences        Marketing
## Frequency                15              358              100
## Proportion            0.017            0.411            0.115
##                                                              
## Value               Medical            Other Technical Degree
## Frequency               270               52               75
## Proportion            0.310            0.060            0.086
## ---------------------------------------------------------------------------
## EmployeeCount 
##        n  missing distinct     Info     Mean      Gmd 
##      870        0        1        0        1        0 
##               
## Value        1
## Frequency  870
## Proportion   1
## ---------------------------------------------------------------------------
## EmployeeNumber 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0      870        1     1030    698.6     86.9    191.1 
##      .25      .50      .75      .90      .95 
##    477.2   1039.0   1561.5   1856.2   1958.3 
## 
## lowest :    1    4   11   13   14, highest: 2041 2053 2056 2062 2064
## ---------------------------------------------------------------------------
## EnvironmentSatisfaction 
##        n  missing distinct 
##      870        0        4 
##                                   
## Value          1     2     3     4
## Frequency    172   178   258   262
## Proportion 0.198 0.205 0.297 0.301
## ---------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      870        0        2 
##                         
## Value      Female   Male
## Frequency     354    516
## Proportion  0.407  0.593
## ---------------------------------------------------------------------------
## HourlyRate 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       71        1    65.61    23.24       34       39 
##      .25      .50      .75      .90      .95 
##       48       66       83       94       97 
## 
## lowest :  30  31  32  33  34, highest:  96  97  98  99 100
## ---------------------------------------------------------------------------
## JobInvolvement 
##        n  missing distinct 
##      870        0        4 
##                                   
## Value          1     2     3     4
## Frequency     47   228   514    81
## Proportion 0.054 0.262 0.591 0.093
## ---------------------------------------------------------------------------
## JobLevel 
##        n  missing distinct 
##      870        0        5 
## 
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##                                         
## Value          1     2     3     4     5
## Frequency    329   312   132    60    37
## Proportion 0.378 0.359 0.152 0.069 0.043
## ---------------------------------------------------------------------------
## JobRole 
##        n  missing distinct 
##      870        0        9 
## 
## lowest : Healthcare Representative Human Resources           Laboratory Technician     Manager                   Manufacturing Director   
## highest: Manufacturing Director    Research Director         Research Scientist        Sales Executive           Sales Representative     
## 
## Healthcare Representative (76, 0.087), Human Resources (27, 0.031),
## Laboratory Technician (153, 0.176), Manager (51, 0.059), Manufacturing
## Director (87, 0.100), Research Director (51, 0.059), Research Scientist
## (172, 0.198), Sales Executive (200, 0.230), Sales Representative (53,
## 0.061)
## ---------------------------------------------------------------------------
## JobSatisfaction 
##        n  missing distinct 
##      870        0        4 
##                                   
## Value          1     2     3     4
## Frequency    179   166   254   271
## Proportion 0.206 0.191 0.292 0.311
## ---------------------------------------------------------------------------
## MaritalStatus 
##        n  missing distinct 
##      870        0        3 
##                                      
## Value      Divorced  Married   Single
## Frequency       191      410      269
## Proportion    0.220    0.471    0.309
## ---------------------------------------------------------------------------
## MonthlyIncome 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0      826        1     6390     4757     2088     2279 
##      .25      .50      .75      .90      .95 
##     2840     4946     8182    13571    17165 
## 
## lowest :  1081  1091  1102  1118  1129, highest: 19845 19859 19926 19943 19999
## ---------------------------------------------------------------------------
## MonthlyRate 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0      852        1    14326     8210     3456     4751 
##      .25      .50      .75      .90      .95 
##     8092    14074    20456    24045    25541 
## 
## lowest :  2094  2104  2112  2125  2137, highest: 26862 26933 26959 26968 26997
## ---------------------------------------------------------------------------
## NumCompaniesWorked 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       10    0.945    2.728    2.683        0        0 
##      .25      .50      .75      .90      .95 
##        1        2        4        7        8 
## 
## lowest : 0 1 2 3 4, highest: 5 6 7 8 9
##                                                                       
## Value          0     1     2     3     4     5     6     7     8     9
## Frequency    111   320    74    91    85    43    39    46    28    33
## Proportion 0.128 0.368 0.085 0.105 0.098 0.049 0.045 0.053 0.032 0.038
## ---------------------------------------------------------------------------
## Over18 
##        n  missing distinct    value 
##      870        0        1        Y 
##               
## Value        Y
## Frequency  870
## Proportion   1
## ---------------------------------------------------------------------------
## OverTime 
##        n  missing distinct 
##      870        0        2 
##                     
## Value        No  Yes
## Frequency   618  252
## Proportion 0.71 0.29
## ---------------------------------------------------------------------------
## PercentSalaryHike 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       15    0.988     15.2    4.047       11       11 
##      .25      .50      .75      .90      .95 
##       12       14       18       21       22 
## 
## lowest : 11 12 13 14 15, highest: 21 22 23 24 25
##                                                                       
## Value         11    12    13    14    15    16    17    18    19    20
## Frequency    126   119   123   120    54    43    56    57    40    27
## Proportion 0.145 0.137 0.141 0.138 0.062 0.049 0.064 0.066 0.046 0.031
##                                         
## Value         21    22    23    24    25
## Frequency     33    30    17    14    11
## Proportion 0.038 0.034 0.020 0.016 0.013
## ---------------------------------------------------------------------------
## PerformanceRating 
##        n  missing distinct 
##      870        0        2 
##                       
## Value          3     4
## Frequency    738   132
## Proportion 0.848 0.152
## ---------------------------------------------------------------------------
## RelationshipSatisfaction 
##        n  missing distinct 
##      870        0        4 
##                                   
## Value          1     2     3     4
## Frequency    174   171   261   264
## Proportion 0.200 0.197 0.300 0.303
## ---------------------------------------------------------------------------
## StandardHours 
##        n  missing distinct     Info     Mean      Gmd 
##      870        0        1        0       80        0 
##               
## Value       80
## Frequency  870
## Proportion   1
## ---------------------------------------------------------------------------
## StockOptionLevel 
##        n  missing distinct 
##      870        0        4 
##                                   
## Value          0     1     2     3
## Frequency    379   355    81    55
## Proportion 0.436 0.408 0.093 0.063
## ---------------------------------------------------------------------------
## TotalWorkingYears 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       39    0.995    11.05    8.048        1        3 
##      .25      .50      .75      .90      .95 
##        6       10       15       22       26 
## 
## lowest :  0  1  2  3  4, highest: 34 35 36 37 40
## ---------------------------------------------------------------------------
## TrainingTimesLastYear 
##        n  missing distinct 
##      870        0        7 
## 
## lowest : 0 1 2 3 4, highest: 2 3 4 5 6
##                                                     
## Value          0     1     2     3     4     5     6
## Frequency     30    39   309   308    73    75    36
## Proportion 0.034 0.045 0.355 0.354 0.084 0.086 0.041
## ---------------------------------------------------------------------------
## WorkLifeBalance 
##        n  missing distinct 
##      870        0        4 
##                                   
## Value          1     2     3     4
## Frequency     48   192   532    98
## Proportion 0.055 0.221 0.611 0.113
## ---------------------------------------------------------------------------
## YearsAtCompany 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       32    0.993    6.962    6.208        1        1 
##      .25      .50      .75      .90      .95 
##        3        5       10       15       20 
## 
## lowest :  0  1  2  3  4, highest: 30 31 32 33 40
## ---------------------------------------------------------------------------
## YearsInCurrentRole 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       19    0.973    4.205    3.967        0        0 
##      .25      .50      .75      .90      .95 
##        2        3        7        9       11 
## 
## lowest :  0  1  2  3  4, highest: 14 15 16 17 18
##                                                                       
## Value          0     1     2     3     4     5     6     7     8     9
## Frequency    151    38   223    68    53    26    17   136    56    40
## Proportion 0.174 0.044 0.256 0.078 0.061 0.030 0.020 0.156 0.064 0.046
##                                                                 
## Value         10    11    12    13    14    15    16    17    18
## Frequency     14    15     7     9     7     3     3     3     1
## Proportion 0.016 0.017 0.008 0.010 0.008 0.003 0.003 0.003 0.001
## ---------------------------------------------------------------------------
## YearsSinceLastPromotion 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       16    0.923    2.169    2.961        0        0 
##      .25      .50      .75      .90      .95 
##        0        1        3        7        9 
## 
## lowest :  0  1  2  3  4, highest: 11 12 13 14 15
##                                                                       
## Value          0     1     2     3     4     5     6     7     8     9
## Frequency    342   214    94    32    32    30    23    41    12     9
## Proportion 0.393 0.246 0.108 0.037 0.037 0.034 0.026 0.047 0.014 0.010
##                                               
## Value         10    11    12    13    14    15
## Frequency      4    14     5     5     5     8
## Proportion 0.005 0.016 0.006 0.006 0.006 0.009
## ---------------------------------------------------------------------------
## YearsWithCurrManager 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870        0       17    0.976     4.14    3.938        0        0 
##      .25      .50      .75      .90      .95 
##        2        3        7        9       10 
## 
## lowest :  0  1  2  3  4, highest: 12 13 14 15 17
##                                                                       
## Value          0     1     2     3     4     5     6     7     8     9
## Frequency    166    40   202    76    51    22    12   131    68    44
## Proportion 0.191 0.046 0.232 0.087 0.059 0.025 0.014 0.151 0.078 0.051
##                                                     
## Value         10    11    12    13    14    15    17
## Frequency     18    11    13     7     4     1     4
## Proportion 0.021 0.013 0.015 0.008 0.005 0.001 0.005
## ---------------------------------------------------------------------------
## AgeGrp 
##        n  missing distinct 
##      870        0        5 
## 
## lowest : 18-24 25-34 35-44 46-54 55+  , highest: 18-24 25-34 35-44 46-54 55+  
##                                         
## Value      18-24 25-34 35-44 46-54   55+
## Frequency     55   336   296   143    40
## Proportion 0.063 0.386 0.340 0.164 0.046
## ---------------------------------------------------------------------------
## AttrNum 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      870        0        2    0.405      140   0.1609   0.2704 
## 
## ---------------------------------------------------------------------------
## GenNum 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      870        0        2    0.724      516   0.5931   0.4832 
## 
## ---------------------------------------------------------------------------
## TravLevel 
##        n  missing distinct     Info     Mean      Gmd 
##      870        0        3    0.634    1.074   0.4906 
##                             
## Value          0     1     2
## Frequency     94   618   158
## Proportion 0.108 0.710 0.182
## ---------------------------------------------------------------------------
## OTNum 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      870        0        2    0.617      252   0.2897    0.412 
## 
## ---------------------------------------------------------------------------
## MariStatNum 
##        n  missing distinct     Info     Mean      Gmd 
##      870        0        3    0.855   0.9103   0.7708 
##                             
## Value          0     1     2
## Frequency    269   410   191
## Proportion 0.309 0.471 0.220
## ---------------------------------------------------------------------------
## EducNum 
##        n  missing distinct     Info     Mean      Gmd 
##      870        0        6    0.898    1.268    1.359 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##                                               
## Value          0     1     2     3     4     5
## Frequency    270   358   100    75    15    52
## Proportion 0.310 0.411 0.115 0.086 0.017 0.060
## ---------------------------------------------------------------------------
## JRoleNum 
##        n  missing distinct     Info     Mean      Gmd 
##      870        0        9    0.972    3.664    2.576 
## 
## lowest : 0 1 2 3 4, highest: 4 5 6 7 8
##                                                                 
## Value          0     1     2     3     4     5     6     7     8
## Frequency     51    87   200   172    53    76    51   153    27
## Proportion 0.059 0.100 0.230 0.198 0.061 0.087 0.059 0.176 0.031
## ---------------------------------------------------------------------------
plot_intro(attritdf)

plot_missing(attritdf)

plot_bar(attritdf)

plot_histogram(attritdf)

plot_qq(attritdf)

# Correlationapalooza
plot_correlation(allnumeric, type = c("all", "discrete", "continuous"),
  maxcat = 20L, cor_args = list(), geom_text_args = list(),
  title = NULL, ggtheme = theme_gray(),
  theme_config = list(legend.position = "bottom", axis.text.x =
  element_text(angle = 90)))

# Correlation with just continuous variables
attritcor1 <- attritdf%>%dplyr::select(NumCompaniesWorked,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,YearsAtCompany)
plot_correlation(attritcor1)

attrcont <- attritdf%>%dplyr::select(Attrition,Age,NumCompaniesWorked,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,YearsAtCompany,MonthlyIncome,TotalWorkingYears)
attrcont %>%
  filter(Attrition == "Yes") %>%
  select_if(is.numeric) %>%
  cor() %>%
  corrplot::corrplot()

attritdf %>% 
  dplyr::select(Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate, YearsAtCompany, YearsWithCurrManager, YearsSinceLastPromotion) %>% 
  gather(metric, value) %>% 
  ggplot(aes(value, fill = metric)) + 
  geom_density(show.legend = FALSE) + 
  facet_wrap(~ metric, scales = "free")

attritdf %>% 
  dplyr::select(Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate, YearsAtCompany, YearsWithCurrManager, YearsSinceLastPromotion) %>% 
  gather(metric, value) %>% 
  ggplot(aes(value, fill = metric)) + 
  geom_histogram(show.legend = FALSE) + 
  facet_wrap(~ metric, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatterplot analysis

plot_scatterplot(attritdf, by="MonthlyIncome")

plot_scatterplot(attritdf, by="YearsAtCompany")

plot_scatterplot(attritdf, by="YearsInCurrentRole")

plot_scatterplot(attritdf, by="YearsSinceLastPromotion")

plot_scatterplot(attritdf, by="YearsWithCurrManager")

plot_scatterplot(attritdf, by="NumCompaniesWorked")

Explore AgeGrp

attritdf%>%ggplot(aes(x=MonthlyIncome,YearsAtCompany,color=AgeGrp))+geom_point()

attritdf%>%ggplot(aes(x=MonthlyIncome,YearsAtCompany,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

attritdf%>%ggplot(aes(x=YearsSinceLastPromotion,PercentSalaryHike,color=AgeGrp))+geom_point()

attritdf%>%ggplot(aes(x=YearsSinceLastPromotion,PercentSalaryHike,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.015
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.015
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1096e-015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.05
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.05
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 1.1096e-015
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 4

attritdf%>%ggplot(aes(x=YearsWithCurrManager,MonthlyIncome,color=AgeGrp))+geom_point()

attritdf%>%ggplot(aes(x=YearsWithCurrManager,MonthlyIncome,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 1

attritdf%>%ggplot(aes(x=YearsAtCompany,MonthlyIncome,color=AgeGrp))+geom_point()

attritdf%>%ggplot(aes(x=YearsAtCompany,MonthlyIncome,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Attrition by Age Group
AgeGrpdf <- attritdf%>%dplyr::select(AgeGrp,Attrition)
AgeGrpdf <- AgeGrpdf%>%filter(Attrition=="Yes")
AgeGrpplot <- ggplot(AgeGrpdf,aes(x=AgeGrp,group=Attrition,color=AgeGrp))+geom_bar(aes(y = ..prop..,fill=factor(..x..)), stat="count")+ggtitle("Attrition by Age Group")+
  geom_text(aes(label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.1)+labs(y = "Percent")+ylab(NULL)+xlab("Age groups")

AgeGrpplot

Attrition by JobRole

AgeJLev <- attritdf%>%dplyr::select(JobLevel,Attrition)
AgeJLev <- AgeJLev%>%filter(Attrition=="Yes")

AgeJLevplot <- ggplot(AgeJLev,aes(x=JobLevel,group=Attrition,color=JobLevel))+geom_bar(aes(y = ..prop..,fill=factor(..x..)), stat="count")+ggtitle("Attrition by Job Level")+
geom_text(aes(label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.1)+labs(y = "Percent")+ylab(NULL)+xlab("Job Levels")

AgeJLevplot

This section creates a dataframe with an abbreviated jobrole title jrole, and calculates the amount of attrition and non-attrition by job role.

tempdf <- attritdf%>%dplyr::select(JobRole,Attrition,MonthlyIncome)%>%group_by(JobRole)
jrno <- tempdf%>%filter(Attrition=="Yes")
jrno <- jrno%>%summarise(AttrCnt=n())
jry <- tempdf%>%filter(Attrition=="No")
jry <- tempdf%>%summarise(NoAttrCnt=n())
AttrJobRoledf <- merge(jry,jrno)
AttrJobRoledf <- AttrJobRoledf %>% mutate(JRole = case_when(
  JobRole == "Healthcare Representative" ~ "HC_Rep",
  JobRole == "Human Resources" ~ "HR",
  JobRole == "Laboratory Technician" ~ "Lab_Tech",
  JobRole == "Manager" ~ "Mgr",
  JobRole == "Manufacturing Director" ~ "Manfact_Dir",
  JobRole == "Research Director" ~ "Re_Dir",
  JobRole == "Research Scientist" ~ "Re_Scientist",
  JobRole == "Sales Executive" ~ "SalesExec",
  JobRole == "Sales Representative" ~ "SalesRep"
))

MonthlyIncome by JobRole

ggplot(tempdf,aes(x=NULL,y=MonthlyIncome,fill=JobRole))+geom_boxplot()+ggtitle("Summary statistics by Monthly Income and Job Role")+ylab("Monthly Income")+xlab("Job Role")

attritdf%>%ggplot()+geom_bar(aes(y=JobSatisfaction,x=JobRole,fill=JobSatisfaction),stat="identity",size=4)+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ylab(NULL)+xlab(NULL)+ggtitle("Job Satisfaction Scores by Job Role")

This section orders the data and reduces to top Job Roles with the most attrition.

ggplot(AttrJobRoledf,aes(x=JobRole,y=AttrCnt))+geom_point()+ylab("Attrition Count")+xlab("Job Role")+theme(axis.text.x = element_text(angle = 90, hjust = 1))

# MonthlyIncome by job role
IncomeJobRolePlot <- ggplot(tempdf,aes(JobRole, MonthlyIncome,fill=JobRole))+geom_bar(stat = "identity",position = position_stack(reverse = TRUE))+coord_flip()
IncomeJobRolePlot

# Continuous variable relationship analysis
# MonthlyIncome,YearsAtCompany,PercentSalaryHike,NumCompaniesWorked,TotalWorkingYears,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager

# Attrition by gender
AttritReduced <- attritdf%>%dplyr::select(Attrition,Age,AgeGrp,Gender,YearsSinceLastPromotion,YearsWithCurrManager,JobRole,)%>%group_by(Attrition="Yes")

AttritReduced%>%ggplot(aes(x=Gender,fill=Gender))+geom_bar()+ggtitle("Bar chart of Attrition by Gender")+xlab(NULL)+ylab(NULL)+geom_text(stat = "count",aes(label=..count..))+theme(legend.position = "none")

Top factors that contribute to turnover multi linear regression

Test assumptions with residual plots and histogram of residuals. Inital qq plots were not as linear as I would have liked but appeared worse after log transforming. I also looked at sqr the continuous variables but did not like how the data turned out.

set.seed(123)
training.samples <- attritdf$AttrNum %>%
createDataPartition(p=0.8, list = FALSE)
train.data <- attritdf[training.samples, ]
test.data <- attritdf[-training.samples, ]

attritdflog <- attritdf%>%dplyr::select(MonthlyIncome,Attrition,AttrNum,Age,GenNum,YearsInCurrentRole,TotalWorkingYears,TrainingTimesLastYear,YearsSinceLastPromotion,JobSatisfaction,PerformanceRating,RelationshipSatisfaction,EnvironmentSatisfaction,DistanceFromHome,StockOptionLevel,NumCompaniesWorked,JobInvolvement,JRoleNum,EducNum,OTNum,TravLevel,MariStatNum)%>%mutate(logYSLP = log(YearsSinceLastPromotion),logDFH=log(DistanceFromHome),logNCW=log(NumCompaniesWorked),logTWY=log(TotalWorkingYears),sqrYSLP=(YearsSinceLastPromotion)^2)

# Initial model with most variables that made sense
model1<-lm(AttrNum~Age+GenNum+YearsInCurrentRole+TotalWorkingYears+TrainingTimesLastYear+YearsSinceLastPromotion+JobSatisfaction+PerformanceRating+RelationshipSatisfaction+EnvironmentSatisfaction+DistanceFromHome+StockOptionLevel+NumCompaniesWorked+JobInvolvement+JRoleNum+EducNum+OTNum+TravLevel+MariStatNum,data=attritdflog)

outlierTest(model1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 485 3.974266         7.6709e-05     0.066737
qqPlot(model1,main = "QQ Plot")
## Warning in rlm.default(x, y, weights, method = method, wt.method =
## wt.method, : 'rlm' failed to converge in 20 steps

## [1] 485 860
leveragePlots(model1,main = "Leverage Plots")

spreadLevelPlot(model1)
## Warning in spreadLevelPlot.lm(model1): 
## 183 negative fitted values removed

## 
## Suggested power transformation:  -0.0007314523
ncvTest(model1)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 161.8543, Df = 1, p = < 2.22e-16
crPlots(model1)

mod1resid = rstudent(model1)
hist(mod1resid,col="red",main = "Histogram of residuals for initial model",ylab = NULL,xlab = NULL)

plot(model1)

summary(model1)
## 
## Call:
## lm(formula = AttrNum ~ Age + GenNum + YearsInCurrentRole + TotalWorkingYears + 
##     TrainingTimesLastYear + YearsSinceLastPromotion + JobSatisfaction + 
##     PerformanceRating + RelationshipSatisfaction + EnvironmentSatisfaction + 
##     DistanceFromHome + StockOptionLevel + NumCompaniesWorked + 
##     JobInvolvement + JRoleNum + EducNum + OTNum + TravLevel + 
##     MariStatNum, data = attritdflog)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62624 -0.20117 -0.07257  0.08974  1.21118 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.8015850  0.1056067   7.590 8.53e-14 ***
## Age                       -0.0033755  0.0016452  -2.052 0.040505 *  
## GenNum                     0.0175231  0.0222197   0.789 0.430553    
## YearsInCurrentRole        -0.0096504  0.0039194  -2.462 0.014011 *  
## TotalWorkingYears         -0.0072845  0.0022859  -3.187 0.001492 ** 
## TrainingTimesLastYear1    -0.2070113  0.0786496  -2.632 0.008643 ** 
## TrainingTimesLastYear2    -0.1360809  0.0613569  -2.218 0.026833 *  
## TrainingTimesLastYear3    -0.1652211  0.0613815  -2.692 0.007251 ** 
## TrainingTimesLastYear4    -0.1308795  0.0698904  -1.873 0.061468 .  
## TrainingTimesLastYear5    -0.1732064  0.0692364  -2.502 0.012551 *  
## TrainingTimesLastYear6    -0.2042315  0.0794925  -2.569 0.010366 *  
## YearsSinceLastPromotion    0.0176616  0.0042659   4.140 3.82e-05 ***
## JobSatisfaction2          -0.0292029  0.0343761  -0.850 0.395839    
## JobSatisfaction3          -0.0613269  0.0313729  -1.955 0.050943 .  
## JobSatisfaction4          -0.1328299  0.0309672  -4.289 2.00e-05 ***
## PerformanceRating4         0.0205147  0.0301432   0.681 0.496329    
## RelationshipSatisfaction2 -0.0675254  0.0344835  -1.958 0.050540 .  
## RelationshipSatisfaction3 -0.0771347  0.0313699  -2.459 0.014139 *  
## RelationshipSatisfaction4 -0.0869954  0.0312165  -2.787 0.005443 ** 
## EnvironmentSatisfaction2  -0.1340363  0.0343595  -3.901 0.000103 ***
## EnvironmentSatisfaction3  -0.1181239  0.0315743  -3.741 0.000196 ***
## EnvironmentSatisfaction4  -0.1188642  0.0314784  -3.776 0.000171 ***
## DistanceFromHome           0.0041932  0.0013440   3.120 0.001871 ** 
## StockOptionLevel1         -0.1045102  0.0334239  -3.127 0.001828 ** 
## StockOptionLevel2         -0.1425094  0.0470130  -3.031 0.002510 ** 
## StockOptionLevel3         -0.0004506  0.0549156  -0.008 0.993455    
## NumCompaniesWorked         0.0178394  0.0047256   3.775 0.000171 ***
## JobInvolvement2           -0.2255851  0.0511692  -4.409 1.18e-05 ***
## JobInvolvement3           -0.2783164  0.0488008  -5.703 1.63e-08 ***
## JobInvolvement4           -0.3223167  0.0588934  -5.473 5.86e-08 ***
## JRoleNum                   0.0111032  0.0048505   2.289 0.022324 *  
## EducNum                    0.0155647  0.0081165   1.918 0.055496 .  
## OTNum                      0.2078221  0.0241489   8.606  < 2e-16 ***
## TravLevel                  0.0582844  0.0205202   2.840 0.004616 ** 
## MariStatNum               -0.0353712  0.0228305  -1.549 0.121689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3158 on 835 degrees of freedom
## Multiple R-squared:  0.291,  Adjusted R-squared:  0.2621 
## F-statistic: 10.08 on 34 and 835 DF,  p-value: < 2.2e-16
car::vif(model1)
##                              GVIF Df GVIF^(1/(2*Df))
## Age                      1.878618  1        1.370627
## GenNum                   1.039202  1        1.019413
## YearsInCurrentRole       1.772554  1        1.331373
## TotalWorkingYears        2.569938  1        1.603103
## TrainingTimesLastYear    1.217523  6        1.016537
## YearsSinceLastPromotion  1.609147  1        1.268522
## JobSatisfaction          1.095869  3        1.015375
## PerformanceRating        1.019952  1        1.009927
## RelationshipSatisfaction 1.094314  3        1.015135
## EnvironmentSatisfaction  1.087170  3        1.014027
## DistanceFromHome         1.041902  1        1.020736
## StockOptionLevel         2.550517  3        1.168884
## NumCompaniesWorked       1.235894  1        1.111708
## JobInvolvement           1.090603  3        1.014560
## JRoleNum                 1.076026  1        1.037317
## EducNum                  1.039921  1        1.019765
## OTNum                    1.046537  1        1.023004
## TravLevel                1.043908  1        1.021718
## MariStatNum              2.367149  1        1.538554
# model after looking at p-values of multi-linear regression and their VIF scores
ggplot(attritdf,aes(sample=YearsSinceLastPromotion,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - YearsSinceLastPromotionw with Attrition")+xlab(NULL)+ylab(NULL)

#ggplot(attritdflog,aes(sample=logYSLP,colour=Attrition))+stat_qq()+stat_qq_line()

ggplot(attritdf,aes(sample=DistanceFromHome,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - DistanceFromHome with Attrition")+xlab(NULL)+ylab(NULL)

#ggplot(attritdflog,aes(sample=logDFH,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - log transformed DistanceFromHome with Attrition")

ggplot(attritdf,aes(sample=NumCompaniesWorked,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - NumCompaniesWorked with Attrition")+xlab(NULL)+ylab(NULL)

#ggplot(attritdflog,aes(sample=logNCW,colour=AttrNum))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - log transformed NumCompaniesWorked with Attrition")

ggplot(attritdflog,aes(sample=TotalWorkingYears,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - TotalWorkingYears with Attrition")+xlab(NULL)+ylab(NULL)

#ggplot(attritdflog,aes(sample=logTWY,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - log transformed TotalWorkingYears with Attrition")


model2 <- lm(AttrNum~YearsSinceLastPromotion+JobSatisfaction+EnvironmentSatisfaction+DistanceFromHome+NumCompaniesWorked+JobInvolvement+OTNum, data=attritdflog)

summary(model2)
## 
## Call:
## lm(formula = AttrNum ~ YearsSinceLastPromotion + JobSatisfaction + 
##     EnvironmentSatisfaction + DistanceFromHome + NumCompaniesWorked + 
##     JobInvolvement + OTNum, data = attritdflog)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60343 -0.18719 -0.08282  0.00263  1.03786 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.4782913  0.0646736   7.395 3.36e-13 ***
## YearsSinceLastPromotion  -0.0001114  0.0036479  -0.031 0.975655    
## JobSatisfaction2         -0.0191338  0.0367817  -0.520 0.603059    
## JobSatisfaction3         -0.0457316  0.0333490  -1.371 0.170640    
## JobSatisfaction4         -0.1206412  0.0329505  -3.661 0.000266 ***
## EnvironmentSatisfaction2 -0.1166118  0.0366180  -3.185 0.001502 ** 
## EnvironmentSatisfaction3 -0.1178138  0.0336459  -3.502 0.000486 ***
## EnvironmentSatisfaction4 -0.1086244  0.0335426  -3.238 0.001248 ** 
## DistanceFromHome          0.0029613  0.0014275   2.075 0.038329 *  
## NumCompaniesWorked        0.0077692  0.0046150   1.683 0.092647 .  
## JobInvolvement2          -0.2500019  0.0547267  -4.568 5.64e-06 ***
## JobInvolvement3          -0.3133234  0.0520614  -6.018 2.61e-09 ***
## JobInvolvement4          -0.3557686  0.0627190  -5.672 1.93e-08 ***
## OTNum                     0.2204916  0.0256488   8.597  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3403 on 856 degrees of freedom
## Multiple R-squared:  0.1562, Adjusted R-squared:  0.1434 
## F-statistic: 12.19 on 13 and 856 DF,  p-value: < 2.2e-16
car::vif(model2)
##                             GVIF Df GVIF^(1/(2*Df))
## YearsSinceLastPromotion 1.013628  1        1.006791
## JobSatisfaction         1.026316  3        1.004339
## EnvironmentSatisfaction 1.032081  3        1.005277
## DistanceFromHome        1.012403  1        1.006183
## NumCompaniesWorked      1.015355  1        1.007648
## JobInvolvement          1.023754  3        1.003920
## OTNum                   1.016974  1        1.008451
plot(model2)

mod2resid = rstudent(model2)
hist(mod2resid,col="blue",main = "Histogram of residuals of final model",ylab = NULL,xlab = NULL)

Classifications

knn (classify attrition as yes) noattritdf=testdata

set.seed(123)
perc=.8
numks = 5

iterations=100
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSensitivity = matrix(nrow = iterations)
masterSpecificity = matrix(nrow = iterations)

for(j in 1:iterations)
{
 accs = data.frame(accuracy=numeric(numks),k=numeric(numks))
 trainIndices = sample(1:dim(attritdf)[1],
 round(perc * dim(attritdf)[1]))
 train = attritdf[trainIndices,]
 test = attritdf[-trainIndices,]

  for(i in 1:numks)
  {
  classifications = knn(train[,c(2,12,15,16,18,22,25)],test[,c(2,12,15,16,18,22,25)],train$AttrNum,prob = TRUE, k = i)
  table(classifications,test$AttrNum)
  CM = confusionMatrix(table(classifications,test$AttrNum))
  masterAcc[j,i] = CM$overall[1]
  masterSensitivity[j]=CM$byClass[1]
  masterSpecificity[j]=CM$byClass[2]
  }
}
ptitle=print(paste0(numks," values of K for ",iterations," iterations."))
## [1] "5 values of K for 100 iterations."
MeanAcc = colMeans(masterAcc)
MeanSpec = mean(masterSpecificity)
MeanSens = mean(masterSensitivity)
plot(seq(1,numks,1),MeanAcc, type = "l",xlab = "Values of K",ylab = "Accuracy",main = ptitle)

which.max(MeanAcc)
## [1] 5
CM
## Confusion Matrix and Statistics
## 
##                
## classifications   0   1
##               0 142  23
##               1   5   4
##                                           
##                Accuracy : 0.8391          
##                  95% CI : (0.7759, 0.8903)
##     No Information Rate : 0.8448          
##     P-Value [Acc > NIR] : 0.631416        
##                                           
##                   Kappa : 0.1568          
##                                           
##  Mcnemar's Test P-Value : 0.001315        
##                                           
##             Sensitivity : 0.9660          
##             Specificity : 0.1481          
##          Pos Pred Value : 0.8606          
##          Neg Pred Value : 0.4444          
##              Prevalence : 0.8448          
##          Detection Rate : 0.8161          
##    Detection Prevalence : 0.9483          
##       Balanced Accuracy : 0.5571          
##                                           
##        'Positive' Class : 0               
## 

Naive Bayse Classification

set.seed(123)
perc=.8
numks = 5
vars = c("")
iterations=20
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSensitivity = matrix(nrow = iterations)
masterSpecificity = matrix(nrow = iterations)
for(i in 1:iterations)
  {
  rn=sample(1:30,1)
  trainIndices = sample(1:dim(attritdf)[1],round(perc * dim(attritdf)[1]))
    train = attritdf[trainIndices,]
    test = attritdf[-trainIndices,]
  model = naiveBayes(train[,c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)],as.factor(train$Attrition),laplace = 1)
  table(predict(model,test[,c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)]),as.factor(test$Attrition))
  
  CM = confusionMatrix(table(predict(model,test[,c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)]),as.factor(test$Attrition)))
  masterAcc[j]=CM$overall[1]
  masterSensitivity[i]=CM$byClass[1]
  masterSpecificity[i]=CM$byClass[1]
}
# Adjusted for noattritiondf
# c(2,5,11,14,15,16,17,19,23,24,26,28,29,30,32,33)

tmpdf <- attritdf[c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)]
tmpdf2 <- noattritdf[c(2,5,11,14,15,16,17,19,23,24,26,28,29,30,32,33)]
attritOut <- predict(model,tmpdf2)
myout=cbind.data.frame(noattritdf$ID,attritOut)
colnames(myout) <- c("ID","Attrition")

write.csv(myout, file = "/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/Case2PredictionsMalcolmCarlsonAttrition.csv",row.names = FALSE)
plot(attritOut,  col="darkgreen", main = "Plot of Attrition Predictions")

CM
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  125  12
##   Yes  14  23
##                                        
##                Accuracy : 0.8506       
##                  95% CI : (0.7888, 0.9)
##     No Information Rate : 0.7989       
##     P-Value [Acc > NIR] : 0.05026      
##                                        
##                   Kappa : 0.5448       
##                                        
##  Mcnemar's Test P-Value : 0.84452      
##                                        
##             Sensitivity : 0.8993       
##             Specificity : 0.6571       
##          Pos Pred Value : 0.9124       
##          Neg Pred Value : 0.6216       
##              Prevalence : 0.7989       
##          Detection Rate : 0.7184       
##    Detection Prevalence : 0.7874       
##       Balanced Accuracy : 0.7782       
##                                        
##        'Positive' Class : No           
## 

Regression analysis of salary

set.seed(123)
training.samples <- attritdf$MonthlyIncome %>%
createDataPartition(p=0.8, list = FALSE)
train.data <- attritdf[training.samples, ]
test.data <- attritdf[-training.samples, ]

modelSalary<-lm(MonthlyIncome~JobLevel+JobRole+
TotalWorkingYears,data=attritdf)

pairs(~MonthlyIncome+JobLevel+TotalWorkingYears,data=attritdf)

plot(modelSalary)

car::vif(modelSalary)
##                        GVIF Df GVIF^(1/(2*Df))
## JobLevel          15.167434  4        1.404798
## JobRole            9.813688  8        1.153425
## TotalWorkingYears  2.918135  1        1.708255
mod1resid = rstudent(modelSalary)
hist(mod1resid,col="darkgreen",main = "Histogram of residuals for model",ylab = NULL,xlab = NULL)

predictions <- modelSalary %>% predict(test.data)
data.frame( R2 = R2(predictions, test.data$MonthlyIncome),
           RMSE = RMSE(predictions, test.data$MonthlyIncome),
            MAE = MAE(predictions, test.data$MonthlyIncome))
##         R2     RMSE      MAE
## 1 0.947316 1034.706 754.5801
summary(modelSalary)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears, 
##     data = attritdf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3184.4  -622.9   -83.6   623.4  4282.9 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3620.912    180.824  20.025  < 2e-16 ***
## JobLevel2                      1687.533    139.069  12.134  < 2e-16 ***
## JobLevel3                      4913.569    186.878  26.293  < 2e-16 ***
## JobLevel4                      8246.787    282.379  29.205  < 2e-16 ***
## JobLevel5                     11003.473    332.523  33.091  < 2e-16 ***
## JobRoleHuman Resources        -1106.211    252.764  -4.376 1.36e-05 ***
## JobRoleLaboratory Technician  -1243.528    175.324  -7.093 2.75e-12 ***
## JobRoleManager                 3332.134    237.808  14.012  < 2e-16 ***
## JobRoleManufacturing Director   134.719    158.506   0.850    0.396    
## JobRoleResearch Director       3479.341    211.496  16.451  < 2e-16 ***
## JobRoleResearch Scientist     -1040.524    178.715  -5.822 8.21e-09 ***
## JobRoleSales Executive          -13.146    136.850  -0.096    0.923    
## JobRoleSales Representative   -1267.454    220.601  -5.745 1.27e-08 ***
## TotalWorkingYears                45.975      7.768   5.918 4.70e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1007 on 856 degrees of freedom
## Multiple R-squared:  0.9527, Adjusted R-squared:  0.952 
## F-statistic:  1327 on 13 and 856 DF,  p-value: < 2.2e-16
salaryPred <- modelSalary%>% predict(nosalarydf)
myout <- cbind.data.frame(nosalarydf$ID,salaryPred)
colnames(myout) <- c("ID","MonthlyIncome")
write.csv(myout, file = "/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/Case2PredictionsMalcolmCarlsonSalary.csv",row.names = FALSE)

Stepwise model

attritdf2 <- attritdf%>%dplyr::select(Age,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager)

# Fit the full model
set.seed(123)
full.model <- lm(MonthlyIncome ~., data = attritdf2)
# Stepwise regression model
step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE)

pairs(~MonthlyIncome+JobLevel+TotalWorkingYears,data=attritdf)

plot(step.model)

car::vif(step.model)
##                        GVIF Df GVIF^(1/(2*Df))
## DailyRate          1.011976  1        1.005970
## JobLevel          15.236178  4        1.405593
## JobRole            9.868255  8        1.153825
## TotalWorkingYears  2.918173  1        1.708266
step.modelid = rstudent(step.model)
hist(step.modelid,col="darkgreen",main = "Histogram of residuals for model",ylab = NULL,xlab = NULL)

predictions <- step.model %>% predict(test.data)
data.frame( R2 = R2(predictions, test.data$MonthlyIncome),
           RMSE = RMSE(predictions, test.data$MonthlyIncome),
            MAE = MAE(predictions, test.data$MonthlyIncome))
##          R2     RMSE     MAE
## 1 0.9475298 1033.253 750.574
summary(step.model)
## 
## Call:
## lm(formula = MonthlyIncome ~ DailyRate + JobLevel + JobRole + 
##     TotalWorkingYears, data = attritdf2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3256.9  -624.8   -84.8   596.6  4173.6 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.478e+03  1.939e+02  17.934  < 2e-16 ***
## DailyRate                      1.728e-01  8.554e-02   2.020   0.0437 *  
## JobLevel2                      1.681e+03  1.389e+02  12.103  < 2e-16 ***
## JobLevel3                      4.907e+03  1.866e+02  26.303  < 2e-16 ***
## JobLevel4                      8.246e+03  2.819e+02  29.253  < 2e-16 ***
## JobLevel5                      1.097e+04  3.322e+02  33.034  < 2e-16 ***
## JobRoleHuman Resources        -1.101e+03  2.523e+02  -4.362 1.45e-05 ***
## JobRoleLaboratory Technician  -1.236e+03  1.750e+02  -7.060 3.44e-12 ***
## JobRoleManager                 3.367e+03  2.380e+02  14.146  < 2e-16 ***
## JobRoleManufacturing Director  1.410e+02  1.583e+02   0.891   0.3732    
## JobRoleResearch Director       3.497e+03  2.113e+02  16.551  < 2e-16 ***
## JobRoleResearch Scientist     -1.035e+03  1.784e+02  -5.802 9.23e-09 ***
## JobRoleSales Executive        -8.351e+00  1.366e+02  -0.061   0.9513    
## JobRoleSales Representative   -1.263e+03  2.202e+02  -5.734 1.36e-08 ***
## TotalWorkingYears              4.592e+01  7.754e+00   5.921 4.62e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1005 on 855 degrees of freedom
## Multiple R-squared:  0.9529, Adjusted R-squared:  0.9522 
## F-statistic:  1237 on 14 and 855 DF,  p-value: < 2.2e-16
car::vif(step.model)
##                        GVIF Df GVIF^(1/(2*Df))
## DailyRate          1.011976  1        1.005970
## JobLevel          15.236178  4        1.405593
## JobRole            9.868255  8        1.153825
## TotalWorkingYears  2.918173  1        1.708266
salaryPred <- step.model%>% predict(nosalarydf)
myout <- cbind.data.frame(nosalarydf$ID,salaryPred)
colnames(myout) <- c("ID","MonthlyIncome")
write.csv(myout, file = "/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/Case2PredictionsMalcolmCarlsonSalary.csv",row.names = FALSE)